Load required libraries
library(tidyverse)   # data wrangling, ggplot2, purrr
library(here)        # project-relative file paths
library(brms)        # Bayesian regression models via Stan
library(tidybayes)   # tidy posterior draws (add_epred_draws)
library(sf)          # spatial vector data (grids, clusters)
library(mgcv)        # smooth-term machinery backing brms s()
library(priorsense)  # power-scaling prior sensitivity checks
library(osmdata)     # OpenStreetMap data access
library(nngeo)       # nearest-neighbour spatial helpers
library(glue)        # string interpolation
library(arrow)       # parquet I/O for out-of-memory posterior draws
Import the modelling dataset - cleaned and prepped in the
2025-05-12_aq_clean.Rmd script
# Modelling dataset, cleaned and prepped in 2025-05-12_aq_clean.Rmd
aq_model_data <- read_rds("input_data/aq_model_data.rds")
#also load the scale clusters
load("input_data/scale_72_clusters.rda") #SCALE clusters (creates `scale_72_clusters`)
#and the grid data (100 m prediction grid and its cropped version)
mw_100m_cropped <- read_rds("input_data/mw_100m_cropped.rds")
mw_100m_grid_sf <- read_rds("input_data/mw_100m_grid_sf.rds")
get_st_predictions(): A function to take posterior draws
for a prediction matrix, write them to an arrow dataset (to
avoid exhausting memory), then summarise per prediction row.
get_st_predictions <- function(model, nd, model_id, out_dir, ndraws = 1000,
                               chunk_size = 10000, who_pm25_limit = 25) {
  # Draw posterior expectations for a (large) prediction frame in chunks,
  # stream each chunk to parquet so all draws never sit in RAM at once,
  # then summarise per prediction row with arrow.
  #
  # model          - fitted brmsfit
  # nd             - prediction data frame (one row per cell x date)
  # model_id       - tag written into the output files
  # out_dir        - directory receiving the parquet chunks
  # ndraws         - posterior draws per prediction row
  # chunk_size     - rows of `nd` per chunk
  # who_pm25_limit - threshold (µg/m³) for the exceedance probability
  #
  # Returns: one-row-per-prediction summary (log and natural scale means,
  # SDs, 95% intervals, exceedance probability) joined back onto `nd`.
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  # FIX(review): remove chunks left by a previous run — a stale run with
  # more chunks would otherwise be silently included in the summary below.
  unlink(list.files(out_dir, pattern = "^epred_chunk_.*\\.parquet$",
                    full.names = TRUE))
  nd <- nd %>% mutate(.row = row_number())
  n_chunks <- ceiling(nrow(nd) / chunk_size)
  # Save design matrix to disk for reproducibility / later joins
  nd_file <- file.path(out_dir, paste0("nd_", model_id, ".parquet"))
  arrow::write_parquet(nd, nd_file)
  # Loop over chunks of prediction rows
  for (i in seq_len(n_chunks)) {
    cat("Processing chunk", i, "of", n_chunks, "\n")
    idx <- ((i - 1) * chunk_size + 1):min(i * chunk_size, nrow(nd))
    nd_chunk <- nd[idx, ]
    # draws x rows matrix of posterior expectations (log scale, since the
    # models fit log_pm2_5)
    epred_array <- posterior_epred(model, newdata = nd_chunk, ndraws = ndraws)
    epred_df <- as.data.frame.table(epred_array, responseName = "epred")
    colnames(epred_df) <- c("draw", "row_in_chunk", "epred")
    epred_df <- epred_df %>%
      mutate(
        draw = as.integer(draw),
        row_in_chunk = as.integer(row_in_chunk),
        .row = idx[row_in_chunk],  # map back to the row index in full `nd`
        model_id = model_id
      ) %>%
      select(draw, .row, epred, model_id)
    arrow::write_parquet(
      epred_df,
      sink = file.path(out_dir, glue::glue("epred_chunk_{i}.parquet"))
    )
    rm(epred_array, epred_df, nd_chunk)
    gc()
  }
  # FIX(review): read back ONLY the epred chunks. open_dataset(out_dir)
  # would also pick up the nd_*.parquet design matrix written above, whose
  # schema differs from the draw files.
  chunk_files <- list.files(out_dir, pattern = "^epred_chunk_.*\\.parquet$",
                            full.names = TRUE)
  ds <- arrow::open_dataset(chunk_files)
  summary_df <- ds %>%
    mutate(epred_pm25 = exp(epred)) %>%  # back-transform from log scale
    group_by(.row) %>%
    summarise(
      mean_log_epred = mean(epred, na.rm = TRUE),
      sd_log_epred = sd(epred, na.rm = TRUE),
      lwr_log = quantile(epred, 0.025, na.rm = TRUE),
      upr_log = quantile(epred, 0.975, na.rm = TRUE),
      mean_epred = mean(epred_pm25, na.rm = TRUE),
      sd_epred = sd(epred_pm25, na.rm = TRUE),
      lwr = quantile(epred_pm25, 0.025, na.rm = TRUE),
      upr = quantile(epred_pm25, 0.975, na.rm = TRUE),
      prob_exceed = mean(epred_pm25 > who_pm25_limit, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    collect() %>%
    left_join(nd, by = ".row") %>%
    # NOTE(review): replaces `date` with its month label for faceting —
    # assumes `nd` always carries a `date` column; confirm for all callers.
    mutate(date = lubridate::month(date, label = TRUE))
  summary_df
}
plot_st_predictions(): A function to plot spatiotemporal
predictions from model posteriors
plot_st_predictions <- function(summary_df,
                                cluster_sf,
                                value_limits = list(mean_log = NULL, mean = NULL,
                                                    sd = NULL, prob = c(0, 1))) {
  # Map the per-cell posterior summaries produced by get_st_predictions(),
  # faceted by month.
  #
  # summary_df   - output of get_st_predictions(): needs x, y, date,
  #                mean_log_epred, mean_epred, sd_epred, prob_exceed
  # cluster_sf   - sf polygons drawn as cluster outlines
  # value_limits - optional fill-scale limits per panel. FIX(review): the
  #                log-scale mean map previously shared `mean` limits with
  #                the natural-scale map, which are on the wrong scale for
  #                log(PM2.5); it now has its own `mean_log` entry
  #                (NULL = free scale, matching the previous default).
  #
  # Returns a named list of four ggplots.
  library(ggplot2)
  library(viridis)
  library(ggdist)
  library(dplyr)
  library(lubridate)
  # Shared theming for all four maps
  map_theme <- theme(
    panel.background = element_rect(fill = NA, colour = "grey78"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.background = element_rect(colour = "grey78")
  )
  # Helper: one faceted tile map of `fill_var` with cluster outlines
  make_map <- function(fill_var, option, limits, title, labels = waiver()) {
    summary_df %>%
      ggplot() +
      geom_tile(aes(x = x, y = y, fill = {{ fill_var }})) +
      geom_sf(data = cluster_sf, colour = "grey78", fill = NA) +
      scale_fill_viridis_c(option = option, limits = limits, labels = labels) +
      facet_wrap(~date) +
      labs(title = title, x = "", y = "") +
      theme_ggdist() +
      map_theme
  }
  list(
    mean_log_plot = make_map(mean_log_epred, "D", value_limits$mean_log,
                             "Posterior mean of log(PM2.5) (µg/m³)"),
    mean_plot = make_map(mean_epred, "D", value_limits$mean,
                         "Posterior mean of PM2.5 (µg/m³)"),
    sd_plot = make_map(sd_epred, "F", value_limits$sd,
                       "Posterior SD of PM2.5 (µg/m³)"),
    exceed_plot = make_map(prob_exceed, "C", value_limits$prob,
                           "Pr(PM2.5 > 25 µg/m³)",
                           labels = scales::percent_format())
  )
}
compute_exposure_metrics() calculates exposure metrics
for the 1-year period.
First we will model PM2.5
“We fitted a spatially smooth regression model for log-transformed PM2.5 concentrations using a Gaussian process (GP) smooth over spatial coordinates (x, y) to capture flexible spatial trends. Seasonal variation was modelled using a Fourier series expansion of day-of-year up to the 3rd harmonic (i.e., sine and cosine terms for annual, semi-annual, and tri-annual cycles) to account for complex seasonal patterns. The model was fitted in a Bayesian framework using brms with a Gaussian likelihood and the cmdstanr backend.”
# Weakly informative priors for the log-PM2.5 model.
# Intercept centred at 3.4 on the log scale (exp(3.4) ≈ 30 µg/m³).
priors <- c(
  prior(normal(3.4, 1), class = "Intercept"),
  prior(normal(0,1), class = "b"),          # seasonal Fourier coefficients
  prior(exponential(1), class = "sigma"),   # residual SD
  prior(exponential(1), class = "sdgp"),    # GP marginal SD
  # NOTE(review): brms warns this global lscale prior is ignored because
  # coefficient-level defaults already exist (see run log); if it is meant
  # to apply, set it with coef = "gpxy" — TODO confirm.
  prior(normal(0, 1), class = "lscale", lb = 0)
)
Fit prior only model
# Prior-predictive model: sample from the priors alone (sample_prior =
# "only") with the same formula as the data model below, so prior draws
# can be compared against the posterior.
m1_prior <- brm(
  formula = log_pm2_5 ~ gp(x, y, k=15) +  # GP smooth over space (k basis fns)
    sin_doy1 + cos_doy1 +                 # annual harmonic
    sin_doy2 + cos_doy2 +                 # semi-annual harmonic
    sin_doy3 + cos_doy3,                  # tri-annual harmonic
  data = aq_model_data,
  family = gaussian(),
  prior = priors,
  sample_prior = "only",                  # ignore the likelihood
  chains = 4, cores = 4,
  backend = "cmdstanr"
)
Warning: The global prior 'normal(0, 1)' of class 'lscale' will not be used in the model as all related coefficients have individual priors already. If you did not set those priors yourself, then maybe brms has assigned default priors. See ?set_prior and ?default_prior for more details.
G3;Model executable is up to date!
gG3;Start sampling
g
Running MCMC with 4 parallel chains...
Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1 finished in 0.2 seconds.
Chain 2 finished in 0.2 seconds.
Chain 3 finished in 0.2 seconds.
Chain 4 finished in 0.2 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.2 seconds.
Total execution time: 0.4 seconds.
# summary(m1_prior)
# plot(m1_prior)
Now model with data.
# Model 1: GP spatial smooth + 3-harmonic Fourier seasonality, fitted to
# the observed data with the priors defined above.
m1 <- brm(
  formula = log_pm2_5 ~ gp(x, y, k=15) +  # GP smooth over space
    sin_doy1 + cos_doy1 +                 # annual harmonic
    sin_doy2 + cos_doy2 +                 # semi-annual harmonic
    sin_doy3 + cos_doy3,                  # tri-annual harmonic
  data = aq_model_data,
  family = gaussian(),
  prior = priors,
  chains = 4, cores = 4,
  backend = "cmdstanr"
)
Warning: The global prior 'normal(0, 1)' of class 'lscale' will not be used in the model as all related coefficients have individual priors already. If you did not set those priors yourself, then maybe brms has assigned default priors. See ?set_prior and ?default_prior for more details.
G3;Model executable is up to date!
gG3;Start sampling
g
Running MCMC with 4 parallel chains...
Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1 finished in 92.8 seconds.
Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4 finished in 98.1 seconds.
Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3 finished in 118.7 seconds.
Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2 finished in 123.1 seconds.
All 4 chains finished successfully.
Mean chain execution time: 108.2 seconds.
Total execution time: 123.4 seconds.
# summary(m1)
# conditional_effects(m1)
# pp_check(m1)
# plot(m1)
Compare prior-only model to model with data using
priorsense package:
# Tidy draws data frame of all m1 parameters (used by priorsense below)
m1_draws <- as_draws_df(m1)
# Extract prior and posterior draws
# powerscale_sensitivity(m1, variable = c("b_Intercept",
#                                         "b_sin_doy1", "b_cos_doy1",
#                                         "b_sin_doy2", "b_cos_doy2",
#                                         "b_sin_doy3", "b_cos_doy3",
#                                         "sdgp_gpxy", "sigma", "Intercept"))
#
# powerscale_plot_dens(m1, variable = c("b_Intercept",
#                                       "b_sin_doy1", "b_cos_doy1",
#                                       "b_sin_doy2", "b_cos_doy2",
#                                       "b_sin_doy3", "b_cos_doy3",
#                                       "sdgp_gpxy", "sigma", "Intercept"))
Predictions by space and time. Here, although we only have data within clusters, we will predict for cells outside of clusters based on covariates. We will also predict for the months for which we do not have data.
set up the predictions matrix
#set_up prediction date frame
# Build the prediction frame from the 100 m grid: one row per cell with its
# centroid coordinates and population density.
# FIX(review): the original called st_centroid() once per coordinate,
# doubling the geometry work — compute the centroids a single time.
mw_grid_sf <- sf::st_sf(mw_100m_grid_sf)
grid_centroids <- sf::st_coordinates(sf::st_centroid(mw_grid_sf))
nd_m1 <- mw_grid_sf %>%
  mutate(x = grid_centroids[, 1],
         y = grid_centroids[, 2]) %>%
  st_drop_geometry() %>%
  rename(pop_density = 1) %>%                   # first column is the pop value
  mutate(pop_density_km2 = pop_density / 0.01)  # 100 m cell = 0.01 km²
Warning: There were 2 warnings in `stopifnot()`.
The first warning was:
ℹ In argument: `x = st_coordinates(st_centroid(.))[, 1]`.
Caused by warning:
! st_centroid assumes attributes are constant over geometries
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
#get the first day of each month for prediction
first_days <- ymd(paste0("2020-", sprintf("%02d", 1:12), "-01"))
first_day_doy <- yday(first_days)
#calculate Fourier terms (harmonics k = 1..3 of the annual cycle)
# NOTE(review): the period used here is 365.25, but the in-sample seasonal
# grid later in this script (doy_grid) uses 365 — both should match the
# period used when the training columns sin_doy*/cos_doy* were built in the
# cleaning script; TODO confirm and align.
first_day_seasonality <- tibble(
  mean_doy = first_day_doy,
  sin_doy1 = sin(2 * pi * mean_doy / 365.25),
  cos_doy1 = cos(2 * pi * mean_doy / 365.25),
  sin_doy2 = sin(4 * pi * mean_doy / 365.25),
  cos_doy2 = cos(4 * pi * mean_doy / 365.25),
  sin_doy3 = sin(6 * pi * mean_doy / 365.25),
  cos_doy3 = cos(6 * pi * mean_doy / 365.25),
  date = first_days
)
#expand grid for prediction: every grid cell crossed with every month
nd_m1 <- nd_m1 %>%
  crossing(first_day_seasonality) %>%
  # NOTE(review): rescales percent to a 0-1 fraction — assumes the model was
  # trained on the fraction scale; TODO confirm against the cleaning script.
  mutate(building_coverage_pct = building_coverage_pct/100)
Get the spatiotemporal predictions for model 1
# Posterior predictions for every grid cell x month, summarised per row;
# draws are spooled to parquet under epred_chunks_m1/
m1_sum <- get_st_predictions(model = m1, nd=nd_m1,
model_id = "m1",
out_dir = "epred_chunks_m1",
ndraws = 1000,
chunk_size = 10000,
who_pm25_limit = 25)
Processing chunk 1 of 33
Processing chunk 2 of 33
Processing chunk 3 of 33
Processing chunk 4 of 33
Processing chunk 5 of 33
Processing chunk 6 of 33
Processing chunk 7 of 33
Processing chunk 8 of 33
Processing chunk 9 of 33
Processing chunk 10 of 33
Processing chunk 11 of 33
Processing chunk 12 of 33
Processing chunk 13 of 33
Processing chunk 14 of 33
Processing chunk 15 of 33
Processing chunk 16 of 33
Processing chunk 17 of 33
Processing chunk 18 of 33
Processing chunk 19 of 33
Processing chunk 20 of 33
Processing chunk 21 of 33
Processing chunk 22 of 33
Processing chunk 23 of 33
Processing chunk 24 of 33
Processing chunk 25 of 33
Processing chunk 26 of 33
Processing chunk 27 of 33
Processing chunk 28 of 33
Processing chunk 29 of 33
Processing chunk 30 of 33
Processing chunk 31 of 33
Processing chunk 32 of 33
Processing chunk 33 of 33
Plot predictions for model 1
# Four faceted maps (log mean, mean, SD, exceedance) with cluster outlines
m1_plots <- plot_st_predictions(summary_df = m1_sum, cluster_sf = scale_72_clusters)
m1_plots %>% map(plot)
$mean_log_plot
$mean_plot
$sd_plot
$exceed_plot
Now just draw 50 random coordinates, and predict by time to see whether we captured the trend.
#sample coordinate points from the household dataset
#we will fix the small number sampled later!
# Seed fixed so the 50 sampled locations are reproducible
set.seed(123)
sampled_points_m1 <- aq_model_data %>%
  sample_n(50) %>%
  select(x, y)
#generate DOY 0–365 and calculate Fourier terms
# Seasonal design grid: day-of-year 0..365 with Fourier harmonics k = 1..3,
# matching the sin/cos predictors in the model formula.
doy_grid <- tibble(doy = 0:365)
for (k in 1:3) {
  angle <- 2 * pi * k * doy_grid$doy / 365
  doy_grid[[paste0("sin_doy", k)]] <- sin(angle)
  doy_grid[[paste0("cos_doy", k)]] <- cos(angle)
}
#expand grid across sampled locations
#expand grid across sampled locations: every point x every day of year
prediction_df_m1 <- sampled_points_m1 %>%
  crossing(doy_grid)
#add predictions (posterior expectation draws, log scale)
preds_m1 <- add_epred_draws(object = m1, newdata = prediction_df_m1)
#summarise: pooled over locations, mean and 95% interval per day of year
preds_m1_sum <- preds_m1 %>%
  ungroup() %>%
  group_by(doy) %>%
  summarise(.epred_mean = mean(.epred),
            .lower = quantile(.epred, probs=0.025),
            .upper = quantile(.epred, probs = 0.975))
#GEt the observed data
#Get the observed data for overplotting against the fitted seasonal trend
observed_data <- aq_model_data %>%
  select(mean_doy, log_pm2_5)
#plot: ribbon = 95% CrI, line = posterior mean, points = observations
# (jittered horizontally only, width 0.5 days, to reduce overplotting)
preds_m1_sum %>%
  ggplot() +
  geom_ribbon(aes(x=doy, ymin=.lower, ymax = .upper), fill="steelblue", alpha=0.3) +
  geom_line(aes(x=doy, y=.epred_mean)) +
  geom_jitter(data = observed_data, aes(x = mean_doy, y = log_pm2_5),
              color = "darkred", alpha = 0.6, width = 0.5, height = 0.0, size = 1.2) +
  labs(title = "Model-estimated log(PM2.5) with empirical measurements",
       subtitle = "Model predictions with 95% CrI and observed data points",
       x = "Day of year",
       y = "log(PM2.5)",
       caption = "Modelled estimates restricted to within clusters") +
  theme_ggdist() +
  theme(panel.background = element_rect(colour = "grey78"),
        legend.position = "none")
Now we include grid-level covariates of distance to the road, population density, and building footprint percent, along with the mean temperature and humidity on the day of measurement (from the PurpleAir monitors).
Again, priors to be fixed later
# Same weakly informative priors as model 1 (redefined here so this chunk
# is self-contained).
priors <- c(
  prior(normal(3.4, 1), class = "Intercept"),
  prior(normal(0,1), class = "b"),          # Fourier + smooth linear terms
  prior(exponential(1), class = "sigma"),   # residual SD
  prior(exponential(1), class = "sdgp"),    # GP marginal SD
  # NOTE(review): brms warns this global lscale prior is ignored because
  # coefficient-level defaults exist already (see run log); TODO confirm
  # whether it should be set with coef = "gpxy".
  prior(normal(0, 1), class = "lscale", lb = 0)
)
# Model 2: model 1's spatial GP + Fourier seasonality, plus thin-plate
# smooths (k = 5 basis functions each) of grid and meteorological covariates.
m2 <- brm(
  formula = log_pm2_5 ~ gp(x, y, k=15) +
    s(mean_temp_c, k=5) +
    s(mean_current_humidity, k=5) +
    s(pop_density_km2, k=5) +
    s(building_coverage_pct, k=5) +
    s(dist_to_road_m, k=5) +
    sin_doy1 + cos_doy1 +
    sin_doy2 + cos_doy2 +
    sin_doy3 + cos_doy3,
  data = aq_model_data,
  family = gaussian(),
  prior = priors,
  chains = 4, cores = 4,
  # FIX(review): the previous run reported 8 divergent transitions after
  # warmup; raise adapt_delta above the 0.8 default as the Stan warning
  # recommends (slower but smaller step sizes).
  control = list(adapt_delta = 0.95),
  backend = "cmdstanr"
)
Warning: The global prior 'normal(0, 1)' of class 'lscale' will not be used in the model as all related coefficients have individual priors already. If you did not set those priors yourself, then maybe brms has assigned default priors. See ?set_prior and ?default_prior for more details.
G3;Compiling Stan program...
g
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
-
\
|
/
G3;Start sampling
g
Running MCMC with 4 parallel chains...
Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2 finished in 156.7 seconds.
Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1 finished in 210.8 seconds.
Chain 4 finished in 210.8 seconds.
Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3 finished in 218.8 seconds.
All 4 chains finished successfully.
Mean chain execution time: 199.3 seconds.
Total execution time: 219.1 seconds.
G3;Warning: 8 of 4000 (0.0%) transitions ended with a divergence.
See https://mc-stan.org/misc/warnings for details.
g
# Posterior summaries and convergence diagnostics (Rhat, ESS) for model 2
summary(m2)
Warning: There were 8 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
Family: gaussian
Links: mu = identity; sigma = identity
Formula: log_pm2_5 ~ gp(x, y, k = 15) + s(mean_temp_c, k = 5) + s(mean_current_humidity, k = 5) + s(pop_density_km2, k = 5) + s(building_coverage_pct, k = 5) + s(dist_to_road_m, k = 5) + sin_doy1 + cos_doy1 + sin_doy2 + cos_doy2 + sin_doy3 + cos_doy3
Data: aq_model_data (Number of observations: 3363)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Smoothing Spline Hyperparameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sds(smean_temp_c_1) 1.74 0.88 0.72 3.96 1.00 2085 2320
sds(smean_current_humidity_1) 1.31 0.85 0.27 3.48 1.00 1563 1074
sds(spop_density_km2_1) 0.73 0.69 0.03 2.54 1.00 1892 2287
sds(sbuilding_coverage_pct_1) 0.69 0.62 0.03 2.28 1.00 1167 1800
sds(sdist_to_road_m_1) 0.33 0.37 0.01 1.33 1.00 2111 2766
Gaussian Process Hyperparameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sdgp(gpxy) 2.33 0.86 1.13 4.35 1.00 2698 2676
lscale(gpxy) 0.03 0.01 0.01 0.05 1.00 4910 3193
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 2.87 0.17 2.52 3.20 1.00 4302 2655
sin_doy1 -0.76 0.07 -0.89 -0.63 1.00 3242 3386
cos_doy1 -0.49 0.07 -0.63 -0.36 1.00 6380 3122
sin_doy2 0.09 0.05 -0.01 0.19 1.00 4416 3466
cos_doy2 -0.05 0.06 -0.16 0.07 1.00 3420 3100
sin_doy3 -0.17 0.04 -0.24 -0.10 1.00 4903 3054
cos_doy3 0.12 0.04 0.03 0.20 1.00 5240 2913
smean_temp_c_1 2.35 0.51 1.34 3.33 1.00 6263 2943
smean_current_humidity_1 1.21 0.38 0.45 1.95 1.00 5830 3264
spop_density_km2_1 0.23 0.56 -0.71 1.43 1.00 4434 2733
sbuilding_coverage_pct_1 0.41 0.32 -0.15 1.07 1.00 3499 3346
sdist_to_road_m_1 -0.22 0.35 -0.84 0.57 1.00 4041 3039
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.54 0.01 0.53 0.56 1.00 6747 2877
Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# Marginal effect curves and posterior-predictive density check for model 2
conditional_effects(m2)
pp_check(m2)
G3;Using 10 posterior draws for ppc type 'dens_overlay' by default.
g
# Trace and density plots for the m2 parameters
plot(m2)
NA
NA
# m2_draws <- as_draws_df(m2)
#
# # Extract prior and posterior draws
# powerscale_sensitivity(m2, variable = c("b_Intercept", "b_sin_doy1", "b_cos_doy1", "b_sin_doy2", "b_cos_doy2", "b_sin_doy3", "b_cos_doy3", "bs_smean_temp_c_1", "bs_smean_current_humidity_1", "bs_spop_density_km2_1", "bs_sbuilding_coverage_pct_1", "bs_sdist_to_road_m_1",
# "sds_smean_temp_c_1", "sds_smean_current_humidity_1", "sds_spop_density_km2_1", "sds_sbuilding_coverage_pct_1",
# "sds_sdist_to_road_m_1", "sdgp_gpxy", "lscale_gpxy", "sigma", "Intercept", "s_smean_temp_c_1[1]", "s_smean_temp_c_1[2]", "s_smean_temp_c_1[3]", "s_smean_current_humidity_1[1]", "s_smean_current_humidity_1[2]", "s_smean_current_humidity_1[3]", "s_spop_density_km2_1[1]",
# "s_spop_density_km2_1[2]", "s_spop_density_km2_1[3]", "s_sbuilding_coverage_pct_1[1]", "s_sbuilding_coverage_pct_1[2]",
# "s_sbuilding_coverage_pct_1[3]", "s_sdist_to_road_m_1[1]", "s_sdist_to_road_m_1[2]", "s_sdist_to_road_m_1[3]"))
#
# powerscale_plot_dens(m2, variable = c("b_Intercept", "b_sin_doy1", "b_cos_doy1", "b_sin_doy2", "b_cos_doy2", "b_sin_doy3", "b_cos_doy3", "bs_smean_temp_c_1", "bs_smean_current_humidity_1", "bs_spop_density_km2_1", "bs_sbuilding_coverage_pct_1", "bs_sdist_to_road_m_1",
# "sds_smean_temp_c_1", "sds_smean_current_humidity_1", "sds_spop_density_km2_1", "sds_sbuilding_coverage_pct_1",
# "sds_sdist_to_road_m_1", "sdgp_gpxy", "lscale_gpxy", "sigma", "Intercept", "s_smean_temp_c_1[1]", "s_smean_temp_c_1[2]", "s_smean_temp_c_1[3]", "s_smean_current_humidity_1[1]", "s_smean_current_humidity_1[2]", "s_smean_current_humidity_1[3]", "s_spop_density_km2_1[1]",
# "s_spop_density_km2_1[2]", "s_spop_density_km2_1[3]", "s_sbuilding_coverage_pct_1[1]", "s_sbuilding_coverage_pct_1[2]",
# "s_sbuilding_coverage_pct_1[3]", "s_sdist_to_road_m_1[1]", "s_sdist_to_road_m_1[2]", "s_sdist_to_road_m_1[3]"))
Prediction matrix for m2
# FIX(review): the original line ended with a dangling `%>%`, so `nd_m2` was
# never assigned and the run aborted with "object 'nd_m2' not found".
# NOTE(review): m2's smooths additionally need covariate columns
# (mean_temp_c, mean_current_humidity, dist_to_road_m, ...) — TODO: add
# them to the prediction grid before predicting.
nd_m2 <- nd_m1
Get predictions for model 2
# Posterior predictions for model 2 over the same cell x month grid
m2_sum <- get_st_predictions(model = m2, nd=nd_m2, model_id = "m2", out_dir = "epred_chunks_m2", ndraws = 1000, chunk_size = 10000, who_pm25_limit = 25)
Error: object 'nd_m2' not found
Error during wrapup: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
Plot predictions for model 2
# FIX(review): `m2_plots` was used without ever being created — build it
# the same way as `m1_plots` above before mapping plot() over it.
m2_plots <- plot_st_predictions(summary_df = m2_sum, cluster_sf = scale_72_clusters)
m2_plots %>% map(plot)
$mean_log_plot
$mean_plot
$sd_plot
$exceed_plot